In [1]:
import plotly.express as px

primary_blue = "#496595"
primary_blue2 = "#85a1c1"
primary_blue3 = "#3f4d63"
primary_grey = "#c6ccd8"
primary_black = "#202022"
primary_bgcolor = "#f4f0ea"

primary_green = px.colors.qualitative.Plotly[2]
In [2]:
import pandas as pd

# Load the data
df = pd.read_csv('C:/Users/priya/Downloads/spam.csv', encoding='latin-1')

df = df.dropna(how="any", axis=1)
df.columns = ['target', 'message']

df.head()
Out[2]:
target message
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   target   5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
In [4]:
df.isnull().sum()
Out[4]:
target     0
message    0
dtype: int64
In [5]:
df['message_len'] = df['message'].apply(lambda x: len(x.split(' ')))
df.head()
Out[5]:
target message message_len
0 ham Go until jurong point, crazy.. Available only ... 20
1 ham Ok lar... Joking wif u oni... 6
2 spam Free entry in 2 a wkly comp to win FA Cup fina... 28
3 ham U dun say so early hor... U c already then say... 11
4 ham Nah I don't think he goes to usf, he lives aro... 13
In [6]:
max(df['message_len'])
Out[6]:
171

EDA¶

In [7]:
balance_counts = df.groupby('target')['target'].agg('count').values
balance_counts
Out[7]:
array([4825,  747], dtype=int64)
In [8]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Bar(
    x=['ham'],
    y=[balance_counts[0]],
    name='ham',
    text=[balance_counts[0]],
    textposition='auto',
    marker_color=primary_blue
))
fig.add_trace(go.Bar(
    x=['spam'],
    y=[balance_counts[1]],
    name='spam',
    text=[balance_counts[1]],
    textposition='auto',
    marker_color=primary_grey
))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Dataset distribution by target</span>'
)
fig.show()
In [9]:
ham_df = df[df['target'] == 'ham']['message_len'].value_counts().sort_index()
spam_df = df[df['target'] == 'spam']['message_len'].value_counts().sort_index()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=ham_df.index,
    y=ham_df.values,
    name='ham',
    fill='tozeroy',
    marker_color=primary_blue,
))
fig.add_trace(go.Scatter(
    x=spam_df.index,
    y=spam_df.values,
    name='spam',
    fill='tozeroy',
    marker_color=primary_grey,
))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Data Roles in Different Fields</span>'
)
fig.update_xaxes(range=[0, 70])
fig.show()

Data pre processing¶

In [10]:
import re
import pandas as pd
import string


# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text
In [11]:
df['message_clean'] = df['message'].apply(clean_text)
df.head()
Out[11]:
target message message_len message_clean
0 ham Go until jurong point, crazy.. Available only ... 20 go until jurong point crazy available only in ...
1 ham Ok lar... Joking wif u oni... 6 ok lar joking wif u oni
2 spam Free entry in 2 a wkly comp to win FA Cup fina... 28 free entry in a wkly comp to win fa cup final...
3 ham U dun say so early hor... U c already then say... 11 u dun say so early hor u c already then say
4 ham Nah I don't think he goes to usf, he lives aro... 13 nah i dont think he goes to usf he lives aroun...
In [12]:
pip install nltk
Requirement already satisfied: nltk in c:\users\priya\anaconda3\lib\site-packages (3.8.1)
Requirement already satisfied: click in c:\users\priya\anaconda3\lib\site-packages (from nltk) (8.1.7)
Requirement already satisfied: joblib in c:\users\priya\anaconda3\lib\site-packages (from nltk) (1.2.0)
Requirement already satisfied: regex>=2021.8.3 in c:\users\priya\anaconda3\lib\site-packages (from nltk) (2023.10.3)
Requirement already satisfied: tqdm in c:\users\priya\anaconda3\lib\site-packages (from nltk) (4.65.0)
Requirement already satisfied: colorama in c:\users\priya\anaconda3\lib\site-packages (from click->nltk) (0.4.6)
Note: you may need to restart the kernel to use updated packages.
In [13]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')  # This will download the stopwords dataset
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\priya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[13]:
True
In [14]:
stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords

def remove_stopwords(text):
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    return text
    
df['message_clean'] = df['message_clean'].apply(remove_stopwords)
df.head()
Out[14]:
target message message_len message_clean
0 ham Go until jurong point, crazy.. Available only ... 20 go jurong point crazy available bugis n great ...
1 ham Ok lar... Joking wif u oni... 6 ok lar joking wif oni
2 spam Free entry in 2 a wkly comp to win FA Cup fina... 28 free entry wkly comp win fa cup final tkts m...
3 ham U dun say so early hor... U c already then say... 11 dun say early hor already say
4 ham Nah I don't think he goes to usf, he lives aro... 13 nah dont think goes usf lives around though

Stemming¶

In [15]:
stemmer = nltk.SnowballStemmer("english")

def stemm_text(text):
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text
In [16]:
df['message_clean'] = df['message_clean'].apply(stemm_text)
df.head()
Out[16]:
target message message_len message_clean
0 ham Go until jurong point, crazy.. Available only ... 20 go jurong point crazi avail bugi n great world...
1 ham Ok lar... Joking wif u oni... 6 ok lar joke wif oni
2 spam Free entry in 2 a wkly comp to win FA Cup fina... 28 free entri wkli comp win fa cup final tkts m...
3 ham U dun say so early hor... U c already then say... 11 dun say earli hor alreadi say
4 ham Nah I don't think he goes to usf, he lives aro... 13 nah dont think goe usf live around though
In [17]:
def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    # Stemm all the words in the sentence
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    
    return text
In [18]:
df['message_clean'] = df['message_clean'].apply(preprocess_data)
df.head()
Out[18]:
target message message_len message_clean
0 ham Go until jurong point, crazy.. Available only ... 20 go jurong point crazi avail bugi n great world...
1 ham Ok lar... Joking wif u oni... 6 ok lar joke wif oni
2 spam Free entry in 2 a wkly comp to win FA Cup fina... 28 free entri wkli comp win fa cup final tkts m...
3 ham U dun say so early hor... U c already then say... 11 dun say ear hor alreadi say
4 ham Nah I don't think he goes to usf, he lives aro... 13 nah dont think goe usf live around though

Target encoding¶

In [19]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['target'])

df['target_encoded'] = le.transform(df['target'])
df.head()
Out[19]:
target message message_len message_clean target_encoded
0 ham Go until jurong point, crazy.. Available only ... 20 go jurong point crazi avail bugi n great world... 0
1 ham Ok lar... Joking wif u oni... 6 ok lar joke wif oni 0
2 spam Free entry in 2 a wkly comp to win FA Cup fina... 28 free entri wkli comp win fa cup final tkts m... 1
3 ham U dun say so early hor... U c already then say... 11 dun say ear hor alreadi say 0
4 ham Nah I don't think he goes to usf, he lives aro... 13 nah dont think goe usf live around though 0

token visualization¶

In [20]:
import numpy as np 
In [21]:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Path to the locally saved image
local_image_path = 'C:/Users/priya/Downloads/icons8-twitter-64.png'  # Update this path to where you saved the image

# Load the image into a numpy array
img = Image.open(local_image_path)
twitter_mask = np.array(img)

# Assuming 'df' and its column 'message_clean' are defined appropriately
wc = WordCloud(
    background_color='white', 
    max_words=200, 
    mask=twitter_mask
)
wc.generate(' '.join(text for text in df.loc[df['target'] == 'ham', 'message_clean']))

# Display the WordCloud
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
No description has been provided for this image
In [22]:
wc = WordCloud(
    background_color='white', 
    max_words=200, 
    mask=twitter_mask,
)
wc.generate(' '.join(text for text in df.loc[df['target'] == 'spam', 'message_clean']))
plt.figure(figsize=(18,10))
plt.title('Top words for SPAM messages', 
          fontdict={'size': 22,  'verticalalignment': 'bottom'})
plt.imshow(wc)
plt.axis("off")
plt.show()
No description has been provided for this image

vectorization¶

In [23]:
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
x = df['message_clean']
y = df['target_encoded']

print(len(x), len(y))
5572 5572
In [24]:
# Split into train and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))
4179 4179
1393 1393
In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# instantiate the vectorizer
vect = CountVectorizer()
vect.fit(x_train)
Out[25]:
CountVectorizer()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CountVectorizer()
In [26]:
# Use the trained to create a document-term matrix from train and test sets
x_train_dtm = vect.transform(x_train)
x_test_dtm = vect.transform(x_test)

5.1 tunning count vectorizer

In [27]:
vect_tunned = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.1, max_df=0.7, max_features=100)

5.2 TF - IDF

In [28]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()

tfidf_transformer.fit(x_train_dtm)
x_train_tfidf = tfidf_transformer.transform(x_train_dtm)

x_train_tfidf
Out[28]:
<4179x5684 sparse matrix of type '<class 'numpy.float64'>'
	with 32201 stored elements in Compressed Sparse Row format>

Glove

In [29]:
texts = df['message_clean']
target = df['target_encoded']
In [30]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Calculate the length of our vocabulary
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(texts)

vocab_length = len(word_tokenizer.word_index) + 1
vocab_length
Out[30]:
6726
In [31]:
import tensorflow as tf 
sequences = [[1], [2, 3], [4, 5, 6]]
tf.keras.preprocessing.sequence.pad_sequences(
    sequences, maxlen=None, dtype='int32', padding='pre',
    truncating='pre', value=0.0
)
Out[31]:
array([[0, 0, 1],
       [0, 2, 3],
       [4, 5, 6]])
In [32]:
tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post')
Out[32]:
array([[1, 0, 0],
       [2, 3, 0],
       [4, 5, 6]])
In [33]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize the Tokenizer and fit it on the texts
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(texts)

def embed(corpus):
    return word_tokenizer.texts_to_sequences(corpus)

# Find the longest sentence by token length
longest_train = max(texts, key=lambda sentence: len(embed([sentence])[0]))
length_long_sentence = len(embed([longest_train])[0])

# Pad the sequences
train_padded_sentences = pad_sequences(
    embed(texts), 
    length_long_sentence, 
    padding='post'
)

print("Padded Sentences:\n", train_padded_sentences)
Padded Sentences:
 [[   2 3179  274 ...    0    0    0]
 [   8  236  527 ...    0    0    0]
 [   9  356  588 ...    0    0    0]
 ...
 [6724 1002 6725 ...    0    0    0]
 [ 138 1251 1603 ...    0    0    0]
 [1986  378  170 ...    0    0    0]]

Glove

In [34]:
embeddings_dictionary = dict()
embedding_dim = 100

# Load GloVe 100D embeddings
glove_path = 'C:/Users/priya/Downloads/glove.6B.100d.txt/glove.6B.100d.txt'
embeddings_index = {}

with open(glove_path, 'r', encoding='utf-8') as fp:
    for line in fp.readlines():
        records = line.split()
        word = records[0]
        vector = np.array(records[1:], dtype='float32')
        embeddings_index[word] = vector

print("Loaded {} word vectors.".format(len(embeddings_index)))
Loaded 400000 word vectors.
In [35]:
embedding_matrix = np.zeros((vocab_length, embedding_dim))

for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        
embedding_matrix
Out[35]:
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
In [36]:
import plotly.figure_factory as ff

x_axes = ['Ham', 'Spam']
y_axes =  ['Spam', 'Ham']

def conf_matrix(z, x=x_axes, y=y_axes):
    
    z = np.flip(z, 0)

    # change each element of z to type string for annotations
    z_text = [[str(y) for y in x] for x in z]

    # set up figure 
    fig = ff.create_annotated_heatmap(z, x=x, y=y, annotation_text=z_text, colorscale='Viridis')

    # add title
    fig.update_layout(title_text='<b>Confusion matrix</b>',
                      xaxis = dict(title='Predicted value'),
                      yaxis = dict(title='Real value')
                     )

    # add colorbar
    fig['data'][0]['showscale'] = True
    
    return fig
In [37]:
# Create a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

# Train the model
nb.fit(x_train_dtm, y_train)
Out[37]:
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MultinomialNB()

Navie Bayes DTM

In [38]:
# Make class anf probability predictions
y_pred_class = nb.predict(x_test_dtm)
y_pred_prob = nb.predict_proba(x_test_dtm)[:, 1]
In [39]:
# calculate accuracy of class predictions
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))

conf_matrix(metrics.confusion_matrix(y_test, y_pred_class))
0.9784637473079684
In [40]:
# Calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)
Out[40]:
0.974296765425861

Navies Bayes

In [41]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

pipe = Pipeline([('bow', CountVectorizer()), 
                 ('tfid', TfidfTransformer()),  
                 ('model', MultinomialNB())])
In [42]:
# Fit the pipeline with the data
pipe.fit(x_train, y_train)

y_pred_class = pipe.predict(x_test)

print(metrics.accuracy_score(y_test, y_pred_class))

conf_matrix(metrics.confusion_matrix(y_test, y_pred_class))
0.9597989949748744

XG Boost

In [43]:
import xgboost as xgb

pipe = Pipeline([
    ('bow', CountVectorizer()), 
    ('tfid', TfidfTransformer()),  
    ('model', xgb.XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        n_estimators=80,
        use_label_encoder=False,
        eval_metric='auc',
        # colsample_bytree=0.8,
        # subsample=0.7,
        # min_child_weight=5,
    ))
])
In [44]:
# Fit the pipeline with the data
pipe.fit(x_train, y_train)

y_pred_class = pipe.predict(x_test)
y_pred_train = pipe.predict(x_train)

print('Train: {}'.format(metrics.accuracy_score(y_train, y_pred_train)))
print('Test: {}'.format(metrics.accuracy_score(y_test, y_pred_class)))

conf_matrix(metrics.confusion_matrix(y_test, y_pred_class))
Train: 0.9830102895429529
Test: 0.9641062455132807

Lstm¶

In [45]:
print("Length of train_padded_sentences:", len(train_padded_sentences))
print("Length of target:", len(target))
Length of train_padded_sentences: 5572
Length of target: 5572
In [46]:
print("Type of embedding_matrix:", type(embedding_matrix))
print("Shape of embedding_matrix:", embedding_matrix.shape)
print("Type of length_long_sentence:", type(length_long_sentence))
print("Value of length_long_sentence:", length_long_sentence)
Type of embedding_matrix: <class 'numpy.ndarray'>
Shape of embedding_matrix: (6726, 100)
Type of length_long_sentence: <class 'int'>
Value of length_long_sentence: 80
In [47]:
from tensorflow.keras.layers import Embedding

# Minimal Embedding example
embedding_layer = Embedding(input_dim=6726, output_dim=100)
In [48]:
import tensorflow as tf
print(tf.__version__)
2.16.1
In [49]:
print(type(embedding_matrix))
print(embedding_matrix.shape)
<class 'numpy.ndarray'>
(6726, 100)
In [50]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

try:
    model = Sequential([
        Embedding(input_dim=6726, output_dim=100)  # Leave out input_length
    ])
    print("Embedding layer initialized successfully without input_length.")
except Exception as e:
    print("Failed to initialize Embedding layer without input_length:", str(e))
Embedding layer initialized successfully without input_length.
In [51]:
from tensorflow.keras.layers import Input, Embedding
from tensorflow.keras.models import Model

input_layer = Input(shape=(8,))  # Directly specify input shape here
embedding_layer = Embedding(input_dim=6726, output_dim=100)(input_layer)  # Without input_length
model = Model(inputs=input_layer, outputs=embedding_layer)

try:
    model.summary()
    print("Model with Embedding layer initialized successfully.")
except Exception as e:
    print("Error in model setup:", str(e))
Model: "functional_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input_layer (InputLayer)        │ (None, 8)              │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ embedding_2 (Embedding)         │ (None, 8, 100)         │       672,600 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 672,600 (2.57 MB)
 Trainable params: 672,600 (2.57 MB)
 Non-trainable params: 0 (0.00 B)
Model with Embedding layer initialized successfully.
In [52]:
try:
    # Test embedding layer creation in isolation
    embedding_layer = Embedding(input_dim=6726, output_dim=100)
    print("Embedding layer standalone initialized successfully.")
except Exception as e:
    print("Embedding layer standalone initialization failed:", str(e))
Embedding layer standalone initialized successfully.

NLP¶

In [53]:
df = pd.read_csv('C:/Users/priya/Downloads/train.csv', encoding="latin-1")
test_df = pd.read_csv('C:/Users/priya/Downloads/test.csv', encoding="latin-1")

df = df.dropna(how="any", axis=1)
df['text_len'] = df['text'].apply(lambda x: len(x.split(' ')))

df.head()
Out[53]:
id text target text_len
0 1 Our Deeds are the Reason of this #earthquake M... 1 13
1 4 Forest fire near La Ronge Sask. Canada 1 7
2 5 All residents asked to 'shelter in place' are ... 1 22
3 6 13,000 people receive #wildfires evacuation or... 1 9
4 7 Just got sent this photo from Ruby #Alaska as ... 1 17

eda¶

In [54]:
balance_counts = df.groupby('target')['target'].agg('count').values
balance_counts
Out[54]:
array([4342, 3271], dtype=int64)
In [55]:
fig = go.Figure()
fig.add_trace(go.Bar(
    x=['Fake'],
    y=[balance_counts[0]],
    name='Fake',
    text=[balance_counts[0]],
    textposition='auto',
    marker_color=primary_blue
))
fig.add_trace(go.Bar(
    x=['Real disaster'],
    y=[balance_counts[1]],
    name='Real disaster',
    text=[balance_counts[1]],
    textposition='auto',
    marker_color=primary_grey
))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Dataset distribution by target</span>'
)
fig.show()
In [56]:
disaster_df = df[df['target'] == 1]['text_len'].value_counts().sort_index()
fake_df = df[df['target'] == 0]['text_len'].value_counts().sort_index()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=disaster_df.index,
    y=disaster_df.values,
    name='Real disaster',
    fill='tozeroy',
    marker_color=primary_blue,
))
fig.add_trace(go.Scatter(
    x=fake_df.index,
    y=fake_df.values,
    name='Fake',
    fill='tozeroy',
    marker_color=primary_grey,
))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Data Roles in Different Fields</span>'
)
fig.show()

data preprocessing¶

In [57]:
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)


def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html, '', text)

# Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub(
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 
        '', 
        text
    )
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    text = remove_url(text)
    text = remove_emoji(text)
    text = remove_html(text)
    
    return text
In [58]:
# Test emoji removal
remove_emoji("Omg another Earthquake 😔😔")
Out[58]:
'Omg another Earthquake '
In [59]:
stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords

stemmer = nltk.SnowballStemmer("english")

def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords and Stemm all the words in the sentence
    text = ' '.join(stemmer.stem(word) for word in text.split(' ') if word not in stop_words)

    return text
In [60]:
test_df['text_clean'] = test_df['text'].apply(preprocess_data)

df['text_clean'] = df['text'].apply(preprocess_data)
df.head()
Out[60]:
id text target text_len text_clean
0 1 Our Deeds are the Reason of this #earthquake M... 1 13 deed reason earthquak may allah forgiv us
1 4 Forest fire near La Ronge Sask. Canada 1 7 forest fire near la rong sask canada
2 5 All residents asked to 'shelter in place' are ... 1 22 resid ask shelter place notifi offic evacu she...
3 6 13,000 people receive #wildfires evacuation or... 1 9 peopl receiv wildfir evacu order california
4 7 Just got sent this photo from Ruby #Alaska as ... 1 17 got sent photo rubi alaska smoke wildfir pour ...

wordcloud¶

In [61]:
def create_corpus_df(tweet, target):
    corpus=[]
    
    for x in tweet[tweet['target']==target]['text_clean'].str.split():
        for i in x:
            corpus.append(i)
    return corpus
In [62]:
from collections import defaultdict

corpus_disaster_tweets = create_corpus_df(df, 1)  # Assuming create_corpus_df and df are defined elsewhere

dic = defaultdict(int)
for word in corpus_disaster_tweets:
    dic[word] += 1
        
top = sorted(dic.items(), key=lambda x: x[1], reverse=True)[:10]
print(top)
[('fire', 266), ('bomb', 179), ('kill', 158), ('news', 132), ('via', 121), ('flood', 120), ('disast', 116), ('california', 115), ('crash', 110), ('suicid', 110)]
In [63]:
local_image_path = 'C:/Users/priya/Downloads/icons8-twitter-64.png'  # Update this path to where you saved the image

# Load the image into a numpy array
img = Image.open(local_image_path)
twitter_mask = np.array(img)


wc = WordCloud(
    background_color='white', 
    max_words=200, 
    mask=twitter_mask,
)
wc.generate(' '.join(text for text in df.loc[df['target'] == 1, 'text_clean']))
plt.figure(figsize=(18,10))
plt.title('Top words for Real Disaster tweets', 
          fontdict={'size': 22,  'verticalalignment': 'bottom'})
plt.imshow(wc)
plt.axis("off")
plt.show()
No description has been provided for this image
In [64]:
corpus_disaster_tweets = create_corpus_df(df, 0)

dic=defaultdict(int)
for word in corpus_disaster_tweets:
    dic[word]+=1
        
top=sorted(dic.items(), key=lambda x:x[1],reverse=True)[:10]
top
Out[64]:
[('like', 306),
 ('get', 222),
 ('amp', 192),
 ('new', 168),
 ('go', 142),
 ('dont', 139),
 ('one', 134),
 ('bodi', 116),
 ('love', 115),
 ('bag', 108)]
In [65]:
wc = WordCloud(
    background_color='white', 
    max_words=200, 
    mask=twitter_mask,
)
wc.generate(' '.join(text for text in df.loc[df['target'] == 0, 'text_clean']))
plt.figure(figsize=(18,10))
plt.title('Top words for Fake messages', 
          fontdict={'size': 22,  'verticalalignment': 'bottom'})
plt.imshow(wc)
plt.axis("off")
plt.show()
No description has been provided for this image

modeling¶

In [66]:
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
x = df['text_clean']
y = df['target']

# Split into train and test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))
5709 5709
1904 1904
In [67]:
pipe = Pipeline([
    ('bow', CountVectorizer()), 
    ('tfid', TfidfTransformer()),  
    ('model', xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='auc',
    ))
])
from sklearn import metrics

# Fit the pipeline with the data
pipe.fit(x_train, y_train)

y_pred_class = pipe.predict(x_test)
y_pred_train = pipe.predict(x_train)

print('Train: {}'.format(metrics.accuracy_score(y_train, y_pred_train)))
print('Test: {}'.format(metrics.accuracy_score(y_test, y_pred_class)))

conf_matrix(metrics.confusion_matrix(y_test, y_pred_class))
Train: 0.861096514275705
Test: 0.773109243697479

Glove-LSTM¶

In [68]:
train_tweets = df['text_clean'].values
test_tweets = test_df['text_clean'].values
train_target = df['target'].values
In [69]:
# Calculate the length of our vocabulary
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(train_tweets)

vocab_length = len(word_tokenizer.word_index) + 1
vocab_length
Out[69]:
13704
In [70]:
def show_metrics(pred_tag, y_test):
    print("F1-score: ", f1_score(pred_tag, y_test))
    print("Precision: ", precision_score(pred_tag, y_test))
    print("Recall: ", recall_score(pred_tag, y_test))
    print("Acuracy: ", accuracy_score(pred_tag, y_test))
    print("-"*50)
    print(classification_report(pred_tag, y_test))
    
def embed(corpus): 
    return word_tokenizer.texts_to_sequences(corpus)
In [71]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Assuming train_tweets and test_tweets are your datasets
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_tweets)  # Fit the tokenizer on training data
In [72]:
train_sequences = tokenizer.texts_to_sequences(train_tweets)
test_sequences = tokenizer.texts_to_sequences(test_tweets)
In [73]:
longest_train = max(train_sequences, key=len)  # Find the longest sequence
length_long_sentence = len(longest_train)  # Get the length of the longest sequence
In [74]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_padded_sentences = pad_sequences(train_sequences, maxlen=length_long_sentence, padding='post')
test_padded_sentences = pad_sequences(test_sequences, maxlen=length_long_sentence, padding='post')
In [75]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize and fit the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_tweets)

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_tweets)
test_sequences = tokenizer.texts_to_sequences(test_tweets)

# Find the longest sequence and its length
longest_train = max(train_sequences, key=len)
length_long_sentence = len(longest_train)

# Pad sequences
train_padded_sentences = pad_sequences(train_sequences, maxlen=length_long_sentence, padding='post')
test_padded_sentences = pad_sequences(test_sequences, maxlen=length_long_sentence, padding='post')

print(train_padded_sentences)  # Print the padded training sentences
[[3635  467  201 ...    0    0    0]
 [ 136    2  106 ...    0    0    0]
 [1338  502 1807 ...    0    0    0]
 ...
 [ 448 1328    0 ...    0    0    0]
 [  28  162 2637 ...    0    0    0]
 [ 171   31  413 ...    0    0    0]]

Glove¶

In [76]:
embedding_matrix = np.zeros((vocab_length, embedding_dim))

for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        
embedding_matrix
Out[76]:
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
In [77]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    train_padded_sentences, 
    train_target, 
    test_size=0.25
)
In [81]:
from tensorflow.keras.layers import Embedding

try:
    # Test with only required parameters
    embedding_layer = Embedding(input_dim=6726, output_dim=100)
    print("Basic Embedding layer initialized successfully.")
except Exception as e:
    print("Failed at basic Embedding initialization:", str(e))
Basic Embedding layer initialized successfully.
In [82]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, GlobalMaxPool1D, Dense, Dropout, BatchNormalization

def glove_lstm():
    try:
        model = Sequential()
        model.add(Embedding(input_dim=6726, output_dim=100, input_length=80, weights=[np.zeros((6726, 100))], trainable=False))
        model.add(Bidirectional(LSTM(80, return_sequences=True, recurrent_dropout=0.2)))
        model.add(GlobalMaxPool1D())
        model.add(BatchNormalization())
        model.add(Dropout(0.5))
        model.add(Dense(80, activation="relu"))
        model.add(Dropout(0.5))
        model.add(Dense(80, activation="relu"))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
        return model
    except Exception as e:
        print(f"An error occurred while creating the model: {str(e)}")
        return None
In [91]:
import matplotlib.pyplot as plt

def plot_learning_curves(history, metrics):
    plt.figure(figsize=(12, 6))
    for subplot, metric_list in enumerate(metrics, 1):
        plt.subplot(1, len(metrics), subplot)
        for metric in metric_list:
            plt.plot(history.history[metric], label=metric)
        plt.title(' and '.join(metric_list).title())
        plt.xlabel('Epochs')
        plt.ylabel('Value')
        plt.legend()
plt.show()
In [84]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(10, activation='relu', input_shape=(10,)),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Simulating training data
import numpy as np
X_train = np.random.random((100, 10))
y_train = np.random.randint(2, size=(100, 1))
X_test = np.random.random((20, 10))
y_test = np.random.randint(2, size=(20, 1))

# Training the model
history = model.fit(
    X_train, 
    y_train, 
    epochs=10,
    batch_size=10,
    validation_data=(X_test, y_test)
)
Epoch 1/10
C:\Users\priya\anaconda3\Lib\site-packages\keras\src\layers\core\dense.py:88: UserWarning:

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.

10/10 ━━━━━━━━━━━━━━━━━━━━ 1s 19ms/step - accuracy: 0.5381 - loss: 0.6791 - val_accuracy: 0.3500 - val_loss: 0.8526
Epoch 2/10
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - accuracy: 0.6358 - loss: 0.6457 - val_accuracy: 0.3500 - val_loss: 0.8490
Epoch 3/10
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.5480 - loss: 0.7108 - val_accuracy: 0.4000 - val_loss: 0.8473
Epoch 4/10
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.6754 - loss: 0.6538 - val_accuracy: 0.4000 - val_loss: 0.8467
Epoch 5/10
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.5631 - loss: 0.6921 - val_accuracy: 0.4000 - val_loss: 0.8471
Epoch 6/10
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 7ms/step - accuracy: 0.6259 - loss: 0.6683 - val_accuracy: 0.4000 - val_loss: 0.8442
Epoch 7/10
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.6993 - loss: 0.6263 - val_accuracy: 0.4000 - val_loss: 0.8433
Epoch 8/10
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.6444 - loss: 0.6558 - val_accuracy: 0.4500 - val_loss: 0.8410
Epoch 9/10
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.6533 - loss: 0.6457 - val_accuracy: 0.4500 - val_loss: 0.8397
Epoch 10/10
10/10 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.5741 - loss: 0.6572 - val_accuracy: 0.4500 - val_loss: 0.8397
In [88]:
from sklearn.metrics import f1_score, precision_score, recall_score

def show_metrics(pred_tag, y_test):
    print("F1-score: ", f1_score(y_test, pred_tag, average='macro'))  
    print("Precision: ", precision_score(y_test, pred_tag, average='macro')) 
    print("Recall: ", recall_score(y_test, pred_tag, average='macro')) 

show_metrics(pred_labels, y_test)
F1-score:  0.28571428571428575
Precision:  0.2
Recall:  0.5
C:\Users\priya\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning:

Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

In [89]:
plot_learning_curves(history, [['loss', 'val_loss'], ['accuracy', 'val_accuracy']])
No description has been provided for this image
In [90]:
import numpy as np

# Predict probabilities for each class
preds = model.predict(X_test)

# Convert probabilities to class labels
pred_labels = np.argmax(preds, axis=1)

# Show metrics
show_metrics(pred_labels, y_test)
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 31ms/step
F1-score:  0.28571428571428575
Precision:  0.2
Recall:  0.5
C:\Users\priya\anaconda3\Lib\site-packages\sklearn\metrics\_classification.py:1469: UndefinedMetricWarning:

Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

In [ ]:
 
In [ ]: